{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using TensorFlow backend.\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split\n",
    "from keras.preprocessing import text, sequence\n",
    "import numpy as np\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "max_features = 20000\n",
    "maxlen = 100\n",
    "\n",
    "train = pd.read_csv(\"./cache/train-processed.csv\")\n",
    "test = pd.read_csv(\"./cache/predict-processed.csv\")\n",
    "\n",
    "list_sentences_train = train.Discuss.values\n",
    "list_sentences_test = test.Discuss.values\n",
    "\n",
    "y = train.Score.values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer = text.Tokenizer(num_words=max_features)\n",
    "tokenizer.fit_on_texts(list(list_sentences_train))\n",
    "list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)\n",
    "list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)\n",
    "X_t = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)\n",
    "X_te = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)\n",
    "\n",
    "EMBEDDING_FILE = './cache/char_vector_128.txt'\n",
    "\n",
    "\n",
    "def get_coefs(word, *arr):\n",
    "    return word, np.asarray(arr, dtype='float32')\n",
    "\n",
    "embeddings_index = dict(get_coefs(*o.strip().split()) for o in open(EMBEDDING_FILE, encoding='utf-8'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "14938\n",
      "-0.0214866 1.66043\n"
     ]
    }
   ],
   "source": [
    "# 一个小bug\n",
    "print(len(embeddings_index))\n",
    "del embeddings_index[str(len(embeddings_index) - 1)]\n",
    "\n",
    "all_embs = np.stack(embeddings_index.values())\n",
    "emb_mean, emb_std = all_embs.mean(), all_embs.std()\n",
    "print(emb_mean, emb_std)\n",
    "\n",
    "word_index = tokenizer.word_index\n",
    "nb_words = min(max_features, len(word_index))\n",
    "embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, 128))\n",
    "for word, i in word_index.items():\n",
    "    if i >= max_features: continue\n",
    "    embedding_vector = embeddings_index.get(word)\n",
    "    if embedding_vector is not None: embedding_matrix[i] = embedding_vector\n",
    "\n",
    "\n",
    "x_train, x_val, y_train, y_val = train_test_split(X_t, y, test_size=0.1, random_state=42)\n",
    "\n",
    "# 保存中间文件，模型调用\n",
    "np.savez(\"./cache/data.npz\", x_train, y_train, x_val, y_val, X_te, embedding_matrix)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
